# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
sns.set()

8 Logistic regression: Others
8.1 Decision Boundary in Classification
# Importing the dataset
dataset = pd.read_csv('datasets/apples_and_oranges.csv')
dataset.head()

|   | Weight | Size | Class |
|---|---|---|---|
| 0 | 69 | 4.39 | orange |
| 1 | 69 | 4.21 | orange |
| 2 | 65 | 4.09 | orange |
| 3 | 72 | 5.85 | apple |
| 4 | 67 | 4.70 | orange |
# No. of apples and oranges
dataset['Class'].value_counts()

Class
orange 20
apple 20
Name: count, dtype: int64
8.1.1 Encoding Target
le = LabelEncoder()
dataset['Class'] = le.fit_transform(dataset['Class'])
le.classes_

array(['apple', 'orange'], dtype=object)
This implies that:

* 0 represents apple
* 1 represents orange
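As a quick sanity check (a minimal sketch using the `le` encoder fitted above, not part of the original output), the integer codes can be decoded back to class names:

# Decode the integer codes back to the original class names
print(le.inverse_transform([0, 1]))  # expected: ['apple' 'orange']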
dataset.head()

|   | Weight | Size | Class |
|---|---|---|---|
| 0 | 69 | 4.39 | 1 |
| 1 | 69 | 4.21 | 1 |
| 2 | 65 | 4.09 | 1 |
| 3 | 72 | 5.85 | 0 |
| 4 | 67 | 4.70 | 1 |
8.1.2 Plotting the dataset
plt.figure(figsize=(9,6))
plt.title('Apples and Oranges', fontweight='bold', fontsize=16)
plt.xlabel('Weight')
plt.ylabel('Size')
scatter = plt.scatter(dataset['Weight'], dataset['Size'], c=dataset['Class'], cmap='viridis')
plt.legend(*scatter.legend_elements(),
loc = 'upper left',
           title = 'Class');

(Figure: scatter plot of Weight vs. Size, colored by Class)
We can observe that oranges have lower weight and size compared to apples. Furthermore, by drawing a straight line between these two groups of data points, we can clearly distinguish apples from oranges.
8.1.3 Building a Logistic Regression model to distinguish apples and oranges
As we can clearly distinguish between apples and oranges using a straight-line decision boundary, we can choose the hypothesis \(y = a_0 + a_1 x_1 + a_2 x_2\) for Logistic Regression,
where
* \(a_0, a_1, a_2\) are the fitting parameters
* \(x_1\) is Weight
* \(x_2\) is Size
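To make the hypothesis concrete, here is a minimal sketch (not part of the original notebook) of how logistic regression turns this linear combination into a probability via the sigmoid function:

def hypothesis(a0, a1, a2, x1, x2):
    # linear combination of the two features
    z = a0 + a1 * x1 + a2 * x2
    # the sigmoid squashes z into a probability in (0, 1)
    return 1 / (1 + np.exp(-z))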
# Defining target and features
y = dataset['Class']
x = dataset.drop(columns=['Class'])

# Creating object of LogisticRegression class
log_reg = LogisticRegression()

# Fitting parameters
log_reg.fit(x, y)

LogisticRegression()
# Intercept - a0
log_reg.intercept_

array([106.60287324])
# Coefficients - a1, a2 respectively
log_reg.coef_

array([[-1.42833694, -1.31285258]])
# Predicting labels for the given dataset
label_predictions = log_reg.predict(x)
label_predictions

array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0])
8.1.4 Linear Decision Boundary with naive features
# Parameter values
a0 = log_reg.intercept_[0]
a1 = log_reg.coef_[0][0]
a2 = log_reg.coef_[0][1]

# Defining x1 and x2 values for decision boundary
x1 = np.array([69, 71])
x2 = - (a0 / a2) - (a1 / a2) * x1

# Plotting the decision boundary
plt.figure(figsize=(9,6))
plt.title('Apples and Oranges', fontweight='bold', fontsize=16)
plt.xlabel('Weight')
plt.ylabel('Size')
scatter = plt.scatter(dataset['Weight'], dataset['Size'], c=dataset['Class'], cmap='viridis')
plt.legend(*scatter.legend_elements(),
loc = 'upper left',
title = 'Class')
plt.plot(x1, x2, color='red', label='Decision Boundary')
plt.show()

(Figure: scatter plot with the linear decision boundary drawn as a red line)
In this problem we have just two features, \(x_1\) and \(x_2\). If we use them as they are, we end up with a straight line that divides the 2D plane into two half-planes.
8.2 Non-linear Decision Boundary
On some occasions we want a more complex boundary, and we can achieve this by transforming our features. For instance, when confronted with a training-data distribution like the one illustrated below, we need to generate polynomial features such as \(x_1^2\) and \(x_2^2\) to better separate the two classes.

(Figure: a class distribution that cannot be separated by a straight line)
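A minimal sketch (hand-picked parameters, not fitted to any data) of why squared features help: with \(z = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \theta_3 x_1^2 + \theta_4 x_2^2\) and \(\theta = (-1, 0, 0, 1, 1)\), the boundary \(z = 0\) is the unit circle \(x_1^2 + x_2^2 = 1\):

theta_demo = np.array([-1, 0, 0, 1, 1])

def demo_score(x1, x2):
    # z = theta0 + theta1*x1 + theta2*x2 + theta3*x1^2 + theta4*x2^2
    return theta_demo @ np.array([1, x1, x2, x1 ** 2, x2 ** 2])

print(demo_score(0.5, 0.5))  # negative -> inside the circular boundary
print(demo_score(1.5, 0.0))  # positive -> outside the circular boundary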
Let's work through a concrete example below to illustrate this concept. For the purpose of illustrating the decision boundary, I chose not to split the data into training and test sets.
#Load our Dataset for Logistic Regression
components = pd.read_csv('datasets/ex2data2.txt', header=None, names = ['feature 1', 'feature 2', 'faulty'])
components.head()

|   | feature 1 | feature 2 | faulty |
|---|---|---|---|
| 0 | 0.051267 | 0.69956 | 1 |
| 1 | -0.092742 | 0.68494 | 1 |
| 2 | -0.213710 | 0.69225 | 1 |
| 3 | -0.375000 | 0.50219 | 1 |
| 4 | -0.513250 | 0.46564 | 1 |
# check the balance of the dataset
components['faulty'].value_counts()

faulty
0 60
1 58
Name: count, dtype: int64
# get positive and negative samples for plotting
pos = components['faulty'] == 1
neg = components['faulty'] == 0

# Visualize Data
fig, axes = plt.subplots();
axes.set_xlabel('Feature 1')
axes.set_ylabel('Feature 2')
axes.scatter(components.loc[pos, 'feature 1'], components.loc[pos, 'feature 2'], color = 'r', marker='x', label='Faulty')
axes.scatter(components.loc[neg, 'feature 1'], components.loc[neg, 'feature 2'], color = 'g', marker='o', label='Non Faulty')
axes.legend(title='Legend', loc = 'best' )
axes.set_xlim(-1, 1.5)
axes.set_ylim(-1, 1.5)

(Figure: scatter plot of feature 1 vs. feature 2, faulty vs. non-faulty components)
As we can see, the positive and negative examples are not linearly separable, so we have to add additional higher-order polynomial features.
# define function to map higher order polynomial features
def mapFeature(X1, X2, degree):
    res = np.ones(X1.shape[0])
    for i in range(1, degree + 1):
        for j in range(0, i + 1):
            res = np.column_stack((res, (X1 ** (i - j)) * (X2 ** j)))
    return res

# Get the features
X = components.iloc[:, :2]

degree = 2

X_poly = mapFeature(X.iloc[:, 0], X.iloc[:, 1], degree)

# Get the target variable
y = components.iloc[:, 2]

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def costFunc(theta, X, y):
    m = y.shape[0]
    z = X.dot(theta)
    h = sigmoid(z)
    term1 = y * np.log(h)
    term2 = (1 - y) * np.log(1 - h)
    J = -np.sum(term1 + term2, axis=0) / m
    return J

# Set initial values for our parameters
initial_theta = np.zeros(X_poly.shape[1]).reshape(X_poly.shape[1], 1)

# Now call the optimization routine
# NOTE: This automatically picks the learning rate
from scipy.optimize import minimize
res = minimize(costFunc, initial_theta.flatten(), args=(X_poly, y))

# our optimized coefficients
theta = res.x

# define a function to plot the decision boundary
def plotDecisionBoundary(theta, degree, axes):
    u = np.linspace(-1, 1.5, 50)
    v = np.linspace(-1, 1.5, 50)
    U, V = np.meshgrid(u, v)
    # convert U, V to vectors for calculating additional features
    # using vectorized implementation
    U = np.ravel(U)
    V = np.ravel(V)
    Z = np.zeros((len(u) * len(v)))
    X_poly = mapFeature(U, V, degree)
    Z = X_poly.dot(theta)
    # reshape U, V, Z back to matrix
    U = U.reshape((len(u), len(v)))
    V = V.reshape((len(u), len(v)))
    Z = Z.reshape((len(u), len(v)))
    cs = axes.contour(U, V, Z, levels=[0], cmap="Greys_r")
    axes.legend(labels=['Non Faulty', 'Faulty', 'Decision Boundary'])
    return cs

# Plot Decision boundary
fig, axes = plt.subplots();
axes.set_xlabel('Feature 1')
axes.set_ylabel('Feature 2')
axes.scatter(components.loc[pos, 'feature 1'], components.loc[pos, 'feature 2'], color = 'r', marker='x', label='Faulty')
axes.scatter(components.loc[neg, 'feature 1'], components.loc[neg, 'feature 2'], color = 'g', marker='o', label='Good')
#axes.legend(title='Legend', loc = 'best' )
plotDecisionBoundary(theta, degree, axes);

(Figure: degree-2 polynomial decision boundary separating the two classes)
Of course, you can increase the degree of the polynomial you want to fit, but overfitting can then become a problem.
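To see how quickly the feature space grows with the degree, here is a small check (a sketch using the `mapFeature` function defined above); the mapping produces \((d+1)(d+2)/2\) columns, including the bias column:

# Number of columns produced by mapFeature for a few polynomial degrees
for d in (1, 2, 6):
    print(d, mapFeature(X.iloc[:, 0], X.iloc[:, 1], d).shape[1])
# expected: 3, 6 and 28 columns respectively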
# set degree = 6
degree = 6
# map features to the degree
X_poly = mapFeature(X.iloc[:, 0], X.iloc[:, 1], degree)
# set initial parameters
initial_theta = np.zeros(X_poly.shape[1]).reshape(X_poly.shape[1], 1)

# Run the optimization function
res = minimize(costFunc, initial_theta.flatten(), args=(X_poly, y))
theta = res.x.reshape(res.x.shape[0], 1)
# Plot Decision boundary
fig, axes = plt.subplots()
axes.set_xlabel('Feature 1')
axes.set_ylabel('Feature 2')
axes.scatter(components.loc[pos, 'feature 1'], components.loc[pos, 'feature 2'], color='r', marker='x', label='Faulty')
axes.scatter(components.loc[neg, 'feature 1'], components.loc[neg, 'feature 2'], color='g', marker='o', label='Good')
#axes.legend(title='Legend', loc='best')
plotDecisionBoundary(theta, degree, axes)

(Figure: degree-6 polynomial decision boundary wrapping tightly around individual training points)
As we can see, the model tries very hard to capture every single example perfectly and overfits the data. In other words, the model has no preconceived notion about the separation of the positive and negative examples and can fit almost any kind of data. Such a model will fail to predict the correct classification when it sees new examples.
One technique to address this is regularization, which we will cover later. The idea is to penalize the algorithm when it tries to overfit by adding a regularization term to the cost function.
The new cost function with regularization is

\(J(\theta) = \frac{1}{m} \sum_{i=1}^{m}\left[-y_i \log(h_\theta(z_i)) - (1 - y_i) \log(1 - h_\theta(z_i))\right] + \frac{\lambda}{2m} \sum_{j=1}^{n} \theta_j^2\)

where \(\lambda\) is the regularization factor and \(n\) is the number of features.

(NOTE: The regularization term does not include the intercept term \(\theta_0\), since the second sum starts at \(j = 1\).)
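As a sketch only (reusing the `sigmoid` function defined above, with a hypothetical name `costFuncReg`), the regularized cost adds the penalty while leaving \(\theta_0\) unpenalized:

def costFuncReg(theta, X, y, lam):
    m = y.shape[0]
    h = sigmoid(X.dot(theta))
    # unregularized cross-entropy part
    J = -np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) / m
    # penalty term: skip theta[0], the intercept
    J += (lam / (2 * m)) * np.sum(theta[1:] ** 2)
    return J

It could be minimized with the same scipy.optimize.minimize call as above, passing the extra lam argument through args.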
8.2.1 By adding Polynomial Features
components

|   | feature 1 | feature 2 | faulty |
|---|---|---|---|
| 0 | 0.051267 | 0.699560 | 1 |
| 1 | -0.092742 | 0.684940 | 1 |
| 2 | -0.213710 | 0.692250 | 1 |
| 3 | -0.375000 | 0.502190 | 1 |
| 4 | -0.513250 | 0.465640 | 1 |
| ... | ... | ... | ... |
| 113 | -0.720620 | 0.538740 | 0 |
| 114 | -0.593890 | 0.494880 | 0 |
| 115 | -0.484450 | 0.999270 | 0 |
| 116 | -0.006336 | 0.999270 | 0 |
| 117 | 0.632650 | -0.030612 | 0 |
118 rows × 3 columns
X = components[['feature 1', 'feature 2']]
y = components['faulty']

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

feature_names = poly.get_feature_names_out()
print(feature_names)

['1' 'feature 1' 'feature 2' 'feature 1^2' 'feature 1 feature 2'
'feature 2^2']
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_poly, y)

LogisticRegression()

label_predictions = model.predict(X_poly)

8.2.1.1 Accuracy Score

accuracy_score(y, label_predictions)

0.8135593220338984
8.2.1.2 Confusion Matrix
cm = confusion_matrix(y, label_predictions)
cm

array([[47, 13],
[ 9, 49]], dtype=int64)
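As a quick consistency check (a small sketch, not in the original notebook), the accuracy reported above can be recovered from the confusion matrix:

# (correct class-0 predictions + correct class-1 predictions) / all samples
print((cm[0, 0] + cm[1, 1]) / cm.sum())  # (47 + 49) / 118 ≈ 0.8136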
8.2.2 By Transforming Continuous Variables
Variable transformation is an important technique for creating robust models with logistic regression. Because the predictors are linear in the log of the odds, it is often helpful to transform the continuous variables to create a more linear relationship. To determine the best transformation of a continuous variable, a univariate plot is very helpful. Remember the informative univariate plot of the Y variable against an X variable in linear regression? That plot is not as easily obtained here, because Y is dichotomous in logistic regression.
There are different recommended solutions. Among them:

* One is to create several variations of the variable (squared, cubed, or logged transformations, etc.).
* Another is to break some continuous variables into segments and treat them as categorical variables. This may work well for picking up nonlinear trends, but the biggest drawback is that it loses the benefit of the linear trend in the curve, and it may also lead to overfitting.

Both ideas are sketched briefly below before being applied to real data.
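Here is that toy sketch (hypothetical values, independent of the dataset used below):

toy = pd.Series([1.0, 2.0, 4.0, 8.0, 16.0])   # a hypothetical continuous predictor
toy_squared = toy ** 2                         # polynomial-style variation
toy_logged = np.log(toy)                       # log transformation (values must be positive)
toy_binned = pd.cut(toy, bins=[0, 2, 8, 16], labels=['low', 'mid', 'high'])  # segments as categories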
train = pd.read_csv('./Datasets/Social_Network_Ads_train.csv') #Develop the model on train data
test = pd.read_csv('./Datasets/Social_Network_Ads_test.csv')  #Test the model on test data

train.head()

|   | User ID | Gender | Age | EstimatedSalary | Purchased |
|---|---|---|---|---|---|
| 0 | 15755018 | Male | 36 | 33000 | 0 |
| 1 | 15697020 | Female | 39 | 61000 | 0 |
| 2 | 15796351 | Male | 36 | 118000 | 1 |
| 3 | 15665760 | Male | 39 | 122000 | 1 |
| 4 | 15794661 | Female | 26 | 118000 | 0 |
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User ID 300 non-null int64
1 Gender 300 non-null object
2 Age 300 non-null int64
3 EstimatedSalary 300 non-null int64
4 Purchased 300 non-null int64
dtypes: int64(4), object(1)
memory usage: 11.8+ KB
train.Gender.value_counts()

Gender
Female 151
Male 149
Name: count, dtype: int64
tmp_1 = pd.get_dummies(train['Gender'], drop_first=True)
train = pd.concat([train, tmp_1], axis=1)
train.head()

|   | User ID | Gender | Age | EstimatedSalary | Purchased | Male |
|---|---|---|---|---|---|---|
| 0 | 15755018 | Male | 36 | 33000 | 0 | True |
| 1 | 15697020 | Female | 39 | 61000 | 0 | False |
| 2 | 15796351 | Male | 36 | 118000 | 1 | True |
| 3 | 15665760 | Male | 39 | 122000 | 1 | True |
| 4 | 15794661 | Female | 26 | 118000 | 0 | False |
tmp_1 = pd.get_dummies(test['Gender'], drop_first=True)
test = pd.concat([test, tmp_1], axis=1)
test.head()

|   | User ID | Gender | Age | EstimatedSalary | Purchased | Male |
|---|---|---|---|---|---|---|
| 0 | 15810944 | Male | 35 | 20000 | 0 | True |
| 1 | 15668575 | Female | 26 | 43000 | 0 | False |
| 2 | 15603246 | Female | 27 | 57000 | 0 | False |
| 3 | 15694829 | Female | 32 | 150000 | 1 | False |
| 4 | 15697686 | Male | 29 | 80000 | 0 | True |
# Separating features and target on training set
y_train = train.Purchased
X_train = train.drop(["Purchased", "Gender", "User ID"], axis = 1)X_train| Age | EstimatedSalary | Male | |
|---|---|---|---|
| 0 | 36 | 33000 | True |
| 1 | 39 | 61000 | False |
| 2 | 36 | 118000 | True |
| 3 | 39 | 122000 | True |
| 4 | 26 | 118000 | False |
| ... | ... | ... | ... |
| 295 | 48 | 96000 | False |
| 296 | 42 | 149000 | True |
| 297 | 28 | 79000 | True |
| 298 | 51 | 134000 | False |
| 299 | 33 | 28000 | False |
300 rows × 3 columns
# Separating features and target on test set
y_test = test.Purchased
X_test = test.drop(["Purchased", "Gender", "User ID"], axis = 1)
X_test

|   | Age | EstimatedSalary | Male |
|---|---|---|---|
| 0 | 35 | 20000 | True |
| 1 | 26 | 43000 | False |
| 2 | 27 | 57000 | False |
| 3 | 32 | 150000 | False |
| 4 | 29 | 80000 | True |
| ... | ... | ... | ... |
| 95 | 49 | 39000 | False |
| 96 | 47 | 34000 | True |
| 97 | 60 | 42000 | True |
| 98 | 39 | 59000 | False |
| 99 | 51 | 23000 | True |
100 rows × 3 columns
from sklearn.linear_model import LogisticRegression
sklearn_model = LogisticRegression()
sklearn_model.fit(X_train, y_train)

LogisticRegression()
y_pred_test = sklearn_model.predict(X_test)
print('Accuracy of logistic regression on test set : {:.4f}'.format(accuracy_score(y_test, y_pred_test)))

Accuracy of logistic regression on test set : 0.8800
8.2.2.1 Log transformation of salary
sns.histplot(train.EstimatedSalary)

(Figure: histogram of EstimatedSalary)
train["log_salary"] = np.log(train["EstimatedSalary"])
sns.histplot(train.log_salary)

(Figure: histogram of log_salary)
The reason for such transformations has nothing to do with the variable's distribution. Instead, it has to do with the functional form of the effect. Say we want to know the effect of the number of publications on the probability of getting tenure. It is reasonable to believe that getting an extra publication when one has only 1 publication has more impact than getting an extra publication when one has already published 50 articles. The log transformation is one way to capture such a (testable) assumption of diminishing returns.
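A tiny numeric sketch of that diminishing-returns idea (illustrative numbers only):

# The same +1 change matters much more at the low end of the scale
print(np.log(2) - np.log(1))    # ~0.69: going from 1 to 2 publications
print(np.log(51) - np.log(50))  # ~0.02: going from 50 to 51 publications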
test["log_salary"] = np.log(test["EstimatedSalary"])# Separating features and target
y_train = train.Purchased
X_train = train.drop(["Purchased", "Gender", "User ID", "EstimatedSalary"], axis = 1)X_train| Age | Male | log_salary | |
|---|---|---|---|
| 0 | 36 | True | 10.404263 |
| 1 | 39 | False | 11.018629 |
| 2 | 36 | True | 11.678440 |
| 3 | 39 | True | 11.711776 |
| 4 | 26 | False | 11.678440 |
| ... | ... | ... | ... |
| 295 | 48 | False | 11.472103 |
| 296 | 42 | True | 11.911702 |
| 297 | 28 | True | 11.277203 |
| 298 | 51 | False | 11.805595 |
| 299 | 33 | False | 10.239960 |
300 rows × 3 columns
from sklearn.linear_model import LogisticRegression
sklearn_model_log = LogisticRegression()
sklearn_model_log.fit(X_train, y_train)

LogisticRegression()
# Separating features and target for the test dataset
y_test_log = test.Purchased
X_test_log = test.drop(["Purchased", "Gender", "User ID", "EstimatedSalary"], axis = 1)y_log_pred_test = sklearn_model_log.predict(X_test_log)
print('Accuracy of logistic regression after log transformation of salary on test set : {:.4f}'.format(accuracy_score( y_test_log, y_log_pred_test)))Accuracy of logistic regression after log transformation of salary on test set : 0.8300
8.2.3 By Binning Continuous Variables
sns.histplot(data=train.Age)

(Figure: histogram of Age)
bins = [train.Age.min()-1, 25, 35, 48, train.Age.max()]
labels = ['Big Kid', 'Young Adult', 'Adult', 'Senior']
train['AgeGroup'] = pd.cut(train["Age"], bins, labels = labels)
# draw a bar plot of purchase rate by age group
sns.barplot(x="AgeGroup", y="Purchased", data=train)
plt.show()

(Figure: bar plot of purchase rate by AgeGroup)
train

|   | User ID | Gender | Age | EstimatedSalary | Purchased | Male | log_salary | AgeGroup |
|---|---|---|---|---|---|---|---|---|
| 0 | 15755018 | Male | 36 | 33000 | 0 | True | 10.404263 | Adult |
| 1 | 15697020 | Female | 39 | 61000 | 0 | False | 11.018629 | Adult |
| 2 | 15796351 | Male | 36 | 118000 | 1 | True | 11.678440 | Adult |
| 3 | 15665760 | Male | 39 | 122000 | 1 | True | 11.711776 | Adult |
| 4 | 15794661 | Female | 26 | 118000 | 0 | False | 11.678440 | Young Adult |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 295 | 15724536 | Female | 48 | 96000 | 1 | False | 11.472103 | Adult |
| 296 | 15701537 | Male | 42 | 149000 | 1 | True | 11.911702 | Adult |
| 297 | 15807481 | Male | 28 | 79000 | 0 | True | 11.277203 | Young Adult |
| 298 | 15603942 | Female | 51 | 134000 | 0 | False | 11.805595 | Senior |
| 299 | 15690188 | Female | 33 | 28000 | 0 | False | 10.239960 | Young Adult |
300 rows × 8 columns
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
train['AgeGroup']= label_encoder.fit_transform(train['AgeGroup'])
train['AgeGroup'].unique()

array([0, 3, 1, 2])
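Note that LabelEncoder assigns codes in alphabetical order of the labels, not in age order; a quick sketch of the resulting mapping (using the fitted `label_encoder`):

# Print each age-group label with its integer code
for label, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):
    print(code, label)
# expected: 0 Adult, 1 Big Kid, 2 Senior, 3 Young Adult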
test['AgeGroup'] = pd.cut(test["Age"], bins, labels = labels)
test['AgeGroup'] = label_encoder.fit_transform(test['AgeGroup'])

train

|   | User ID | Gender | Age | EstimatedSalary | Purchased | Male | log_salary | AgeGroup |
|---|---|---|---|---|---|---|---|---|
| 0 | 15755018 | Male | 36 | 33000 | 0 | True | 10.404263 | 0 |
| 1 | 15697020 | Female | 39 | 61000 | 0 | False | 11.018629 | 0 |
| 2 | 15796351 | Male | 36 | 118000 | 1 | True | 11.678440 | 0 |
| 3 | 15665760 | Male | 39 | 122000 | 1 | True | 11.711776 | 0 |
| 4 | 15794661 | Female | 26 | 118000 | 0 | False | 11.678440 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 295 | 15724536 | Female | 48 | 96000 | 1 | False | 11.472103 | 0 |
| 296 | 15701537 | Male | 42 | 149000 | 1 | True | 11.911702 | 0 |
| 297 | 15807481 | Male | 28 | 79000 | 0 | True | 11.277203 | 3 |
| 298 | 15603942 | Female | 51 | 134000 | 0 | False | 11.805595 | 2 |
| 299 | 15690188 | Female | 33 | 28000 | 0 | False | 10.239960 | 3 |
300 rows × 8 columns
# Separating features and target on train set
y_train = train.Purchased
X_train = train.drop(["Purchased", "Gender", "User ID", "EstimatedSalary", "Age"], axis = 1)
X_train

|   | Male | log_salary | AgeGroup |
|---|---|---|---|
| 0 | True | 10.404263 | 0 |
| 1 | False | 11.018629 | 0 |
| 2 | True | 11.678440 | 0 |
| 3 | True | 11.711776 | 0 |
| 4 | False | 11.678440 | 3 |
| ... | ... | ... | ... |
| 295 | False | 11.472103 | 0 |
| 296 | True | 11.911702 | 0 |
| 297 | True | 11.277203 | 3 |
| 298 | False | 11.805595 | 2 |
| 299 | False | 10.239960 | 3 |
300 rows × 3 columns
from sklearn.linear_model import LogisticRegression
sklearn_model_bin = LogisticRegression()
sklearn_model_bin.fit(X_train, y_train)

LogisticRegression()
# Separating features and target on test set
y_test = test.Purchased
X_test_bin = test.drop(["Purchased", "Gender", "User ID", "EstimatedSalary", "Age"], axis = 1)
X_test_bin

|   | Male | log_salary | AgeGroup |
|---|---|---|---|
| 0 | True | 9.903488 | 3 |
| 1 | False | 10.668955 | 3 |
| 2 | False | 10.950807 | 3 |
| 3 | False | 11.918391 | 3 |
| 4 | True | 11.289782 | 3 |
| ... | ... | ... | ... |
| 95 | False | 10.571317 | 2 |
| 96 | True | 10.434116 | 0 |
| 97 | True | 10.645425 | 2 |
| 98 | False | 10.985293 | 0 |
| 99 | True | 10.043249 | 2 |
100 rows × 3 columns
y_bin_pred_test = sklearn_model_bin.predict(X_test_bin)
print('Accuracy of logistic regression after age binning on test set : {:.4f}'.format(accuracy_score(y_test, y_bin_pred_test)))

Accuracy of logistic regression after age binning on test set : 0.7100
8.3 References
- https://www.linkedin.com/pulse/generating-non-linear-decision-boundaries-using-logistic-d-urso/
- https://jermwatt.github.io/machine_learning_refined/notes/10_Nonlinear_intro/10_4_Twoclass.html
- https://www.kaggle.com/code/lzs0047/logistic-regression-non-linear-decision-boundary/edit
- https://www.kaggle.com/code/ashishrane7/logistic-regression-non-linear-decision-boundary/notebook